// vim: set syntax=asm :

// Template fragment: applies the instruction given by op to every accumulator
// register ymm(from)..ymm(to), pairing each one with a vector staged from the
// buffer whose address is read from [rdi + 8].
// Template inputs used here: L, label, mr, from, to, op, flipped, type.
// NOTE(review): the modulo pairing below looks like it assumes accumulators
// are grouped mr/8 registers at a time starting at a multiple of mr/8; confirm
// against the kernel that includes this fragment.
{{L}}{{label}}:
    mov             rax, [ rdi + 8 ]

{% capture vec_count %}{{ mr | divided_by: 8 }}{% endcapture %}
{% capture vec_last %}{{ mr | divided_by: 8 | minus: 1 }}{% endcapture %}

// Staging: registers ymm(to + 1) .. ymm(to + mr/8) receive the data at rax,
// eight elements per register (16 bytes each for f16, 32 bytes otherwise).
{% for lane in (0..vec_last) %}
    {% if type == "f16" %}
        vmovups         xmm{{ lane | plus: to | plus: 1 }},  [rax + {{ lane | times: 16 }}]
    {% else %}
        vmovups         ymm{{ lane | plus: to | plus: 1 }},  [rax + {{ lane | times: 32 }}]
    {% endif %}
{% endfor %}

{% if type == "f16" %}
    // f16 input: widen each staged group of 8 halves to 8 singles in place.
    {% for lane in (0..vec_last) %}
        vcvtph2ps       ymm{{ lane | plus: to | plus: 1 }}, xmm{{ lane | plus: to | plus: 1 }}
    {% endfor %}
{% endif %}

// Accumulator acc is combined with staged register to + 1 + (acc mod mr/8).
// When flipped is set the accumulator is the first source operand, otherwise
// the staged register is. The result always lands in the accumulator.
{% for acc in (from..to) %}
    {% capture staged %}ymm{{ acc | modulo: vec_count | plus: to | plus: 1 }}{% endcapture %}
    {% if flipped %}
        {{op}} ymm{{acc}}, ymm{{acc}}, {{staged}}
    {% else %}
        {{op}} ymm{{acc}}, {{staged}}, ymm{{acc}}
    {% endif %}
{% endfor %}

    jmp {{L}}non_linear_loop
